Data Visualization#

Once upon a time there were plots upon plots upon plots.

Load data#

Hide code cell source
import pandas as pd
import sys
sys.path.append('../')
from source.bokeh_plots import *
from source.data_visualization import *
output_notebook()

file_path = '../data/al_atlas_main_results.xlsx'
model_name = 'AML Epigenomic Risk'

# Read the data (first spreadsheet column becomes the index) and sort by index
df = pd.read_excel(file_path, index_col=0).sort_index()

# Define train and test samples. Explicit copies make each cohort an
# independent frame, so later column assignments on a cohort (e.g. casting
# 'Age (years)' on df_test) do not trigger pandas' SettingWithCopyWarning.
df_train = df[df['Train-Test'] == 'Train Sample'].copy()
df_test = df[df['Train-Test'] == 'Test Sample'].copy()

# Keep the training samples that have a 'Vital Status' label
df_px = df_train[df_train['Vital Status'].notna()]

# Keep the training samples that have a 'WHO 2022 Diagnosis' label
df_dx = df_train[df_train['WHO 2022 Diagnosis'].notna()]

# Exclude the classes with fewer than 10 samples
df_dx = df_dx[~df_dx['WHO 2022 Diagnosis'].isin([
    'MPAL with t(v;11q23.3)/KMT2A-r',
    'B-ALL with hypodiploidy',
    'AML with t(16;21); FUS::ERG',
    'AML with t(9;22); BCR::ABL1',
])]

# Select Dx samples from COG AAML1031, AAML0531, and AAML03P1, keeping one
# row per patient (the last occurrence of each Patient_ID)
df_cog = df[df['Clinical Trial'].isin(['AAML0531', 'AAML1031', 'AAML03P1'])]
df_cog = df_cog[df_cog['Sample Type'].isin([
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Primary Blood Derived Cancer - Peripheral Blood',
])]
df_cog = df_cog.drop_duplicates(subset='Patient_ID', keep='last')
Loading BokehJS ...

Interactive atlas#

Hide code cell source
# Draw the linked scatter plots for the full dataset `df`
# (plot_linked_scatters is presumably provided by the star imports above — confirm)
plot_linked_scatters(df)

Patient Characteristics#

Foundation (unsupervised) model#

Hide code cell source
from tableone import TableOne
from datetime import date

# Variables summarised for the foundation (unsupervised) model cohort
columns = ['Hematopoietic Entity', 'Age (group years)', 'Sex', 'Clinical Trial']

# Only the variables in `columns` appear in the table, so a display order is
# given for 'Age (group years)' alone. The previous `order` dict also carried
# entries for variables that are not tabulated here (and a duplicated
# 'FLT3 ITD' key); those had no effect on the table and were dropped.
mytable_cog = TableOne(
    df_train.reset_index(),
    columns,
    overall=False,
    missing=True,
    pval=False,
    pval_adjust=False,
    htest_name=True,
    dip_test=True,
    tukey_test=True,
    normal_test=True,
    order={'Age (group years)': ['0-5', '5-13', '13-39', '39-60']},
)

# Save a dated copy of the table next to the input data
mytable_cog.to_excel(f'../data/pt_characteristics_foundation_model_{date.today()}.xlsx')

# Last expression of the cell: its value is what the notebook displays
mytable_cog.tabulate(tablefmt="html")
Hide code cell output
Missing Overall
n 3308
Hematopoietic Entity, n (%) Acute lymphoblastic leukemia (ALL) 844 700 (28.4)
Acute myeloid leukemia (AML) 1207 (49.0)
Acute promyelocytic leukemia (APL) 31 (1.3)
Mixed phenotype acute leukemia (MPAL) 50 (2.0)
Myelodysplastic syndrome (MDS or MDS-like) 225 (9.1)
Otherwise-Normal (Control) 251 (10.2)
Age (group years), n (%) 0-5 1320 480 (24.1)
5-13 482 (24.2)
13-39 658 (33.1)
39-60 165 (8.3)
60+ 203 (10.2)
Sex, n (%) Female 1511 883 (49.1)
Male 914 (50.9)
Clinical Trial, n (%) AAML03P1 41 72 (2.2)
AAML0531 628 (19.2)
AAML1031 581 (17.8)
Beat AML Consortium 316 (9.7)
CCG2961 41 (1.3)
CETLAM SMD-09 (MDS-tAML) 166 (5.1)
French GRAALL 2003–2005 141 (4.3)
Japanese AML05 64 (2.0)
NOPHO ALL92-2000 933 (28.6)
TARGET ALL 131 (4.0)
TCGA AML 194 (5.9)

Fine-tuned (supervised) models#

Hide code cell source
columns = ['Age (years)', 'Age group (years)', 'Sex', 'Race or ethnic group',
           'Hispanic or Latino ethnic group', 'MRD 1 Status',
           'Leucocyte counts (10⁹/L)', 'BM leukemic blasts (%)',
           'Risk Group', 'FLT3 ITD', 'Clinical Trial']

# df_test was created by filtering `df`; copy it before assigning a column so
# the cast is applied to an independent frame (avoids SettingWithCopyWarning)
df_test = df_test.copy()
df_test['Age (years)'] = df_test['Age (years)'].astype(float)

# join discovery clinical data with validation clinical data; the concat key
# of each frame ends up in a 'cohort' column after reset_index()
all_cohorts = pd.concat(
    [df_dx, df_px, df_test],
    axis=0,
    keys=['AL Epigenomic Phenotype', 'AML Epigenomic Risk', 'Validation'],
    names=['cohort'],
).reset_index()

# Category display order. 'FLT3 ITD' used to be listed twice in this dict;
# Python keeps only the last value for a repeated key, so the single entry
# below is the one that was already in effect.
category_order = {
    'Race or ethnic group': ['White', 'Black or African American', 'Asian'],
    'MRD 1 Status': ['Positive'],
    'Risk Group': ['High Risk', 'Standard Risk'],
    'FLT3 ITD': ['Yes'],
    'Leucocyte counts (10⁹/L)': ['≥30'],
    'Age group (years)': ['≥10'],
}

mytable_cog = TableOne(
    all_cohorts,
    columns,
    overall=False,
    missing=False,
    pval=False,
    pval_adjust=False,
    htest_name=True,
    dip_test=True,
    tukey_test=True,
    normal_test=True,
    order=category_order,
    groupby='cohort',
)

# Save a dated copy of the table next to the input data
mytable_cog.to_excel(f'../data/pt_characteristics_fine-tuned_models_{date.today()}.xlsx')

# Last expression of the cell: its value is what the notebook displays
mytable_cog.tabulate(tablefmt="html")
Hide code cell output
AL Epigenomic Phenotype AML Epigenomic Risk Validation
n 2445 1844 201
Age (years), mean (SD) 19.3 (19.8) 19.5 (21.4) 8.8 (6.0)
Age group (years), n (%) ≥10 520 (47.2) 644 (48.2) 95 (47.7)
<10 581 (52.8) 693 (51.8) 104 (52.3)
Sex, n (%) Female 702 (50.4) 853 (49.2) 87 (43.3)
Male 691 (49.6) 879 (50.8) 114 (56.7)
Race or ethnic group, n (%) White 1052 (80.4) 1302 (80.4) 143 (71.9)
Black or African American 131 (10.0) 155 (9.6) 32 (16.1)
Asian 65 (5.0) 87 (5.4) 1 (0.5)
American Indian or Alaska Native 7 (0.5) 8 (0.5)
Native Hawaiian or other Pacific Islander 7 (0.5) 10 (0.6) 2 (1.0)
Other 46 (3.5) 57 (3.5) 21 (10.6)
Hispanic or Latino ethnic group, n (%) Hispanic or Latino 204 (19.3) 245 (19.0) 25 (12.6)
Not Hispanic or Latino 851 (80.7) 1044 (81.0) 174 (87.4)
MRD 1 Status, n (%) Positive 282 (29.7) 361 (31.4) 76 (40.2)
Negative 667 (70.3) 787 (68.6) 113 (59.8)
Leucocyte counts (10⁹/L), n (%) ≥30 572 (52.4) 646 (48.9) 88 (44.0)
<30 520 (47.6) 676 (51.1) 112 (56.0)
BM leukemic blasts (%), mean (SD) 65.8 (24.1) 65.1 (24.2) 60.0 (25.6)
Risk Group, n (%) High Risk 195 (14.1) 299 (17.5) 51 (25.4)
Standard Risk 620 (44.9) 849 (49.7) 87 (43.3)
Low Risk 566 (41.0) 561 (32.8) 63 (31.3)
FLT3 ITD, n (%) Yes 179 (16.3) 248 (18.6) 31 (15.6)
No 920 (83.7) 1087 (81.4) 168 (84.4)
Clinical Trial, n (%) AAML03P1 62 (2.6) 72 (4.0)
AAML0531 510 (21.2) 628 (34.8)
AAML1031 489 (20.3) 581 (32.2)
Beat AML Consortium 192 (8.0) 225 (12.5)
CCG2961 31 (1.3) 41 (2.3)
CETLAM SMD-09 (MDS-tAML) 166 (6.9)
French GRAALL 2003–2005 141 (5.9)
Japanese AML05 9 (0.4) 15 (0.8)
NOPHO ALL92-2000 636 (26.5)
TARGET ALL 50 (2.1) 47 (2.6)
TCGA AML 118 (4.9) 194 (10.8)
AML02 159 (79.1)
AML08 42 (20.9)

By prognostic group#

Discovery#

Hide code cell source
def pt_characteristics_by_model(df, model_name, traintest = 'discovery'):
    """
    Tabulates patient characteristics grouped by a model's predicted class.

    Builds a TableOne summary (with p-values and the name of the statistical
    test) of the clinical variables listed below, grouped by the `model_name`
    column, writes it to a dated Excel file under '../data/', and returns the
    table rendered as HTML.

    Parameters:
    - df: pandas DataFrame containing the clinical columns and the `model_name` column.
    - model_name: Name of the column to group by; also used in the file name and table header.
    - traintest: Cohort label used in the file name and table header.

    Returns:
    - The result of `TableOne.tabulate(tablefmt="html", ...)`.

    Note: the header row hard-codes the group labels 'High' and 'Low', so the
    grouping column is presumably expected to contain exactly those two groups.
    """
    columns = ['Age (years)', 'Age group (years)', 'Sex', 'Race or ethnic group',
               'Hispanic or Latino ethnic group', 'MRD 1 Status',
               'Leucocyte counts (10⁹/L)', 'BM leukemic blasts (%)',
               'Risk Group', 'Clinical Trial', 'FLT3 ITD', 'Treatment Arm']

    # Category display order. 'FLT3 ITD' used to appear twice in this dict;
    # only the last value of a repeated key is kept, so ['Yes'] was already
    # the effective setting.
    category_order = {
        'Race or ethnic group': ['White', 'Black or African American', 'Asian'],
        'MRD 1 Status': ['Positive'],
        'Risk Group': ['High Risk', 'Standard Risk'],
        'FLT3 ITD': ['Yes'],
        'Leucocyte counts (10⁹/L)': ['≥30'],
        'Age group (years)': ['≥10'],
    }

    mytable_cog = TableOne(df, columns,
                           overall=False, missing=True,
                           pval=True, pval_adjust=False,
                           htest_name=True, dip_test=True,
                           tukey_test=True, normal_test=True,
                           order=category_order,
                           groupby=model_name)

    # Dated output file, e.g. pt_characteristics_<model>_<cohort>_<YYYY-MM-DD>.xlsx
    mytable_cog.to_excel(f'../data/pt_characteristics_{model_name}_{traintest}_{date.today()}.xlsx')

    header = [model_name + ' ' + traintest, "", 'Missing', 'High', 'Low',
              'p-value', 'Statistical Test']
    return mytable_cog.tabulate(tablefmt="html", headers=header)

# Discovery table: samples in df_px, grouped by the `model_name` column
pt_characteristics_by_model(df_px, model_name, 'discovery')
Hide code cell output
AML Epigenomic Risk discovery Missing High Low p-value Statistical Test
n 810 1034
Age (years), mean (SD) 65 23.0 (24.7) 16.6 (17.9) <0.001 Two Sample T-test
Age group (years), n (%) ≥10 507 278 (48.9) 366 (47.7) 0.704 Chi-squared
<10 291 (51.1) 402 (52.3)
Sex, n (%) Female 112 363 (46.5) 490 (51.5) 0.046 Chi-squared
Male 417 (53.5) 462 (48.5)
Race or ethnic group, n (%) White 225 600 (81.1) 702 (79.9) 0.077 Chi-squared (warning: expected count < 5)
Black or African American 71 (9.6) 84 (9.6)
Asian 46 (6.2) 41 (4.7)
American Indian or Alaska Native 1 (0.1) 7 (0.8)
Native Hawaiian or other Pacific Islander 3 (0.4) 7 (0.8)
Other 19 (2.6) 38 (4.3)
Hispanic or Latino ethnic group, n (%) Hispanic or Latino 555 93 (17.0) 152 (20.5) 0.140 Chi-squared
Not Hispanic or Latino 453 (83.0) 591 (79.5)
MRD 1 Status, n (%) Positive 696 207 (43.3) 154 (23.0) <0.001 Chi-squared
Negative 271 (56.7) 516 (77.0)
Leucocyte counts (10⁹/L), n (%) ≥30 522 262 (46.9) 384 (50.3) 0.235 Chi-squared
<30 297 (53.1) 379 (49.7)
BM leukemic blasts (%), mean (SD) 236 67.2 (24.6) 63.4 (23.8) 0.002 Two Sample T-test
Risk Group, n (%) High Risk 135 212 (27.6) 87 (9.2) <0.001 Chi-squared
Standard Risk 487 (63.5) 362 (38.4)
Low Risk 68 (8.9) 493 (52.3)
Clinical Trial, n (%) AAML03P1 41 41 (5.1) 31 (3.1) <0.001 Chi-squared
AAML0531 249 (30.7) 379 (38.2)
AAML1031 242 (29.9) 339 (34.1)
Beat AML Consortium 120 (14.8) 105 (10.6)
CCG2961 27 (3.3) 14 (1.4)
Japanese AML05 10 (1.2) 5 (0.5)
TARGET ALL 14 (1.7) 33 (3.3)
TCGA AML 107 (13.2) 87 (8.8)
FLT3 ITD, n (%) Yes 509 127 (22.4) 121 (15.8) 0.003 Chi-squared
No 441 (77.6) 646 (84.2)
Treatment Arm, n (%) Arm A 1146 121 (41.7) 189 (46.3) 0.259 Chi-squared
Arm B 169 (58.3) 219 (53.7)

Validation#

Hide code cell source
# Validation table: samples in df_test, grouped by the `model_name` column
pt_characteristics_by_model(df_test, model_name, 'validation')
Hide code cell output
AML Epigenomic Risk validation Missing High Low p-value Statistical Test
n 75 126
Age (years), mean (SD) 2 9.3 (6.0) 8.5 (6.0) 0.344 Two Sample T-test
Age group (years), n (%) ≥10 2 38 (51.4) 57 (45.6) 0.523 Chi-squared
<10 36 (48.6) 68 (54.4)
Sex, n (%) Female 0 32 (42.7) 55 (43.7) 1.000 Chi-squared
Male 43 (57.3) 71 (56.3)
Race or ethnic group, n (%) White 2 53 (72.6) 90 (71.4) 0.724 Chi-squared (warning: expected count < 5)
Black or African American 11 (15.1) 21 (16.7)
Asian 1 (1.4)
Native Hawaiian or other Pacific Islander 1 (1.4) 1 (0.8)
Other 7 (9.6) 14 (11.1)
Hispanic or Latino ethnic group, n (%) Hispanic or Latino 2 11 (14.9) 14 (11.2) 0.594 Chi-squared
Not Hispanic or Latino 63 (85.1) 111 (88.8)
MRD 1 Status, n (%) Positive 12 37 (51.4) 39 (33.3) 0.021 Chi-squared
Negative 35 (48.6) 78 (66.7)
Leucocyte counts (10⁹/L), n (%) ≥30 1 28 (37.8) 60 (47.6) 0.231 Chi-squared
<30 46 (62.2) 66 (52.4)
BM leukemic blasts (%), mean (SD) 21 65.1 (27.5) 57.1 (24.1) 0.051 Two Sample T-test
Risk Group, n (%) High Risk 0 28 (37.3) 23 (18.3) <0.001 Chi-squared
Standard Risk 38 (50.7) 49 (38.9)
Low Risk 9 (12.0) 54 (42.9)
Clinical Trial, n (%) AML02 0 61 (81.3) 98 (77.8) 0.674 Chi-squared
AML08 14 (18.7) 28 (22.2)
FLT3 ITD, n (%) Yes 2 14 (18.9) 17 (13.6) 0.425 Chi-squared
No 60 (81.1) 108 (86.4)
Treatment Arm, n (%) Arm A 2 41 (56.2) 66 (52.4) 0.713 Chi-squared
Arm B 32 (43.8) 60 (47.6)

Kaplan-Meier Plots#

Overall study population#

Hide code cell source
# One Kaplan-Meier figure per cohort, each paired with the label passed as `trialname`
for dataset, trial in ((df_cog, 'COG AML trials'), (df_test, 'Validation cohort')):
    draw_kaplan_meier(
        df=dataset,
        model_name=model_name,
        trialname=trial,
        figsize=(8, 8),
        show_ci=False,
        add_risk_counts=False,
        save_plot=False,
        save_survival_table=False,
    )
Hide code cell output
../_images/a39149c99e670ef503a6517e30ceaa6e14a6542f3a9719270ae980f75a5dbdaa.png ../_images/b28029c7e42e0938e5880e8f725e51d6e537c64fad6352fae503ba6e74c23bb8.png

Per risk group#

Hide code cell source
# For every cohort, one Kaplan-Meier figure per value of the 'Risk Group' column
for dataset, trial in ((df_cog, 'COG AML trials'), (df_test, 'Validation cohort')):

    risk_groups = ['High Risk', 'Low Risk', 'Standard Risk']
    for risk_group in risk_groups:
        # Rows of this cohort that belong to the current risk group
        in_group = dataset['Risk Group'] == risk_group
        draw_kaplan_meier(
            df=dataset[in_group],
            model_name=model_name,
            trialname=f'{trial} {risk_group}',
            figsize=(8, 8),
            add_risk_counts=False,
            save_plot=False,
            save_survival_table=False,
        )
Hide code cell output
../_images/e2d00aa178986566e8271d5e4a79bd0afe453208ca64c8b86b174eee8d3a07db.png ../_images/021d94e69254aa2cf97ff8f3293b1a8e8a927d615e64997443b29b90884ff632.png ../_images/2fe5e981f3d9179c0aa6f908da69c2dd970f53a7b28055506d576c8d953065ee.png ../_images/b000e2a93c07b7606fc5eb743e2151828cf9c7b283b6cd8614e592cfc75367bc.png ../_images/487a6d098a7f9948906590c3305a85fb795f29a38d4eb36cd73edd789b309f05.png ../_images/6d1d8c12db6c9dad7a54dcca7d4b2daea71ac27302041ea0014e4887aeec6dff.png

Per risk group (AAML1831 COG)#

Hide code cell source
# Only the COG cohort is plotted here, split by the 'Risk Group AAML1831' column
dataset, trial = df_cog, 'COG AML trials'

risk_groups = ['High', 'Low', 'Standard']
for risk_group in risk_groups:
    # Rows of the cohort assigned to the current AAML1831 risk group
    in_group = dataset['Risk Group AAML1831'] == risk_group
    draw_kaplan_meier(
        df=dataset[in_group],
        model_name=model_name,
        trialname=f'{trial} {risk_group} Risk',
        figsize=(8, 8),
        add_risk_counts=False,
        save_plot=False,
        save_survival_table=False,
    )
Hide code cell output
../_images/6cd7f44ed8d1f757c927768cf9f1a64a23b8e21d3697bc400a6ca6f3f38bca5d.png ../_images/babe96cdbe8892a8dfa5e1a1bbe80dee434a384bcee0739cb0969f16aa5d84f4.png ../_images/bcfd9a0fbb4f262f33285e70bc937999999cea33605661185aba400f794a82cf.png

Forest Plots#

With MRD 1#

Hide code cell source
# Two forest plots per cohort: overall survival first, then event-free survival
for dataset, trial in ((df_cog, 'COG AML trials'), (df_test, 'Validation cohort')):

    # Work on a copy that exposes the model column under a name without spaces
    # (presumably required by the plotting helper — confirm)
    df_ = dataset.assign(AML_Epigenomic_Risk=dataset['AML Epigenomic Risk'])

    for time_col, event_col in (('os.time', 'os.evnt'), ('efs.time', 'efs.evnt')):
        draw_forest_plot(
            time=time_col,
            event=event_col,
            df=df_,
            trialname=trial,
            model_name='AML_Epigenomic_Risk',
            save_plot=False,
        )
Hide code cell output
../_images/7e3f60d2e9fa9819d8b1ebd0313b915a62c48dd9f77b6478c1ce5ed61ae79e37.png ../_images/e346f2bb9e6432af65874830f5b88424987732d45bba2dfc0fcdf6cf95557bca.png ../_images/5eed1eca34177815868a9e34d872b4197643e83e9018b181dab450234dc7b4c4.png ../_images/5391542a88ec3d64309b6c53632f134d933a37dc2fe3a10a94129aead6ad7d8a.png

With MRD 1 and BM blast (%)#

Hide code cell source
# Two forest plots per cohort: overall survival first, then event-free survival
for dataset, trial in zip([df_cog, df_test], ['COG AML trials', 'Validation cohort']):

    df_ = dataset.copy()

    # Dichotomise the blast percentage at 50%. `include_lowest=True` closes the
    # first bin on the left, so a value of exactly 0 is labelled '≤50' instead
    # of falling outside (0, 50] and turning into NaN.
    df_['BM leukemic blasts (%)'] = pd.cut(df_['BM leukemic blasts (%)'],
                                           bins=[0, 50, 100],
                                           labels=['≤50', '>50'],
                                           include_lowest=True)

    # Expose the model column under a name without spaces
    df_['AML_Epigenomic_Risk'] = df_['AML Epigenomic Risk']

    for time_col, event_col in [('os.time', 'os.evnt'), ('efs.time', 'efs.evnt')]:
        draw_forest_plot_withBMblast(time=time_col,
                                     event=event_col,
                                     df=df_,
                                     trialname=trial,
                                     model_name='AML_Epigenomic_Risk',
                                     save_plot=False)
Hide code cell output
../_images/bd3a0ea4f874fa8c61d5f975627805de25be5873c092a55f683dee16eaddaffe.png ../_images/28ffc619800e306f043c354784ec462948356eeb06415ffea24f9258f3b60294.png ../_images/3a469c89d2329e7a0d3f000b982bbc62f5409a332e196afcc6adc453798fb43f.png ../_images/a1ee19d307609327c09e0b13e576421e2496142d679897ed6f466d007f2152b9.png

Without MRD 1#

Hide code cell source
# Two forest plots per cohort: overall survival first, then event-free survival
for dataset, trial in zip([df_cog, df_test], ['COG AML trials', 'Validation cohort']):

    df_ = dataset.copy()

    # Dichotomise the blast percentage at 50%. `include_lowest=True` closes the
    # first bin on the left, so a value of exactly 0 is labelled '≤50' instead
    # of falling outside (0, 50] and turning into NaN.
    df_['BM leukemic blasts (%)'] = pd.cut(df_['BM leukemic blasts (%)'],
                                           bins=[0, 50, 100],
                                           labels=['≤50', '>50'],
                                           include_lowest=True)

    # Expose the model column under a name without spaces
    df_['AML_Epigenomic_Risk'] = df_['AML Epigenomic Risk']

    for time_col, event_col in [('os.time', 'os.evnt'), ('efs.time', 'efs.evnt')]:
        draw_forest_plot_noMRD(time=time_col,
                               event=event_col,
                               df=df_,
                               trialname=trial,
                               model_name='AML_Epigenomic_Risk',
                               save_plot=False)
Hide code cell output
../_images/8c152bf1d325c2940b8e41b740e1c668104675aab1604d1ae0f549ceed53f191.png ../_images/86ba901dd0a9d11e24f32461ba80b4cadb875a5d03a02e07f8399fe9e974d85c.png ../_images/073b32f169be568135a088b58ef3e6d73370b271ddd217aef1fb49b56c27296d.png ../_images/cd8494d979475ef38734f70f0c5442b12fc2048dd64926212a85ec60f8bfe7a0.png

ROC AUC performance#

def plot_roc_auc(df, target, model_name, risk_group='Risk Group', title=None, sum_models=False):
    """
    Plots ROC AUC flexibly using Bokeh.

    Risk labels ('Low'/'Standard'/'High', with or without the ' Risk' suffix)
    are encoded as 0 / 0.5 / 1 so they can be used as scores. Rows with a
    missing value in any plotted column or in `target` are dropped first.

    Parameters:
    - df: pandas DataFrame containing model predictions as columns and actual target variable.
    - target: Name of the column containing the actual target variable.
    - model_name: Name of the column containing the model predictions
      (numeric scores, or risk labels that are encoded as above).
    - risk_group: Name of the column containing the risk group.
    - title: Title of the plot; the number of rows used is appended to it.
      With no title the plot is left untitled.
    - sum_models: If True, plot one curve for the mean of the model score and
      the encoded risk group instead of one curve for each.

    Returns:
    - The Bokeh figure holding the ROC curve(s) and the chance diagonal.
    """

    low_high_dict = {'Low': 0, 'Low Risk': 0,
                     'Standard': 0.5, 'Standard Risk': 0.5,
                     'High': 1, 'High Risk': 1}

    scores = df.copy()

    # Encode the model column unless it already holds numbers. Testing for
    # "not numeric" (rather than dtype == object) also covers string and
    # categorical columns; astype(object) makes the mapped result a plain
    # numeric column whatever the input dtype was.
    if not pd.api.types.is_numeric_dtype(scores[model_name]):
        scores[model_name] = scores[model_name].astype(object).map(low_high_dict)

    scores[risk_group] = scores[risk_group].astype(object).map(low_high_dict)

    if sum_models:
        combined = model_name + ' + ' + risk_group
        scores[combined] = (scores[model_name] + scores[risk_group]) / 2
        scores = scores[[combined, target]]
    else:
        scores = scores[[model_name, risk_group, target]]

    # drop rows with missing values (including labels that were not in the mapping)
    scores = scores.dropna()

    colors = ['navy', 'firebrick', 'olive']

    title_ = f'{title}, n={len(scores)}' if title else ''

    p = figure(title=title_,
               x_axis_label='False Positive Rate',
               y_axis_label='True Positive Rate',
               width=325, height=325,
               tools='save,reset,pan')

    # Chance-level reference line
    p.line([0, 1], [0, 1], line_dash="dashed", color="gray", line_width=1)

    # One ROC curve per score column (every column except the target)
    for column, color in zip(scores.columns.difference([target]), colors):
        fpr, tpr, _ = roc_curve(scores[target], scores[column])
        roc_auc = auc(fpr, tpr)
        p.line(fpr, tpr, legend_label=f"{column}\nAUC = {roc_auc:.2f}",
               color=color, line_width=2, alpha=0.8)

    p.legend.location = "bottom_right"
    p.legend.click_policy = "hide"
    p.toolbar.logo = None
    p.legend.label_text_font_size = '8pt'
    p.legend.spacing = 2
    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"
    p.legend.background_fill_alpha = 0.8
    p.title.text_font_size = '9pt'

    return p

AML epigenomic risk (probability) + risk group#

Hide code cell source
# Probability model
model_name = 'P(High Risk)'

# (cohort, panel title) for each column of the grid
roc_cohorts = [
    (df_px, 'Discovery cohort'),
    (df_cog, 'Discovery COG peds AML Dx'),
    (df_test, 'Validation cohort'),
]

# Top row: model and risk group as separate curves (titled panels)
p1, p2, p3 = [plot_roc_auc(cohort, 'os.evnt', model_name, title=label)
              for cohort, label in roc_cohorts]

# Bottom row: a single curve for the averaged model + risk group score (untitled)
p4, p5, p6 = [plot_roc_auc(cohort, 'os.evnt', model_name, sum_models=True)
              for cohort, _ in roc_cohorts]

# Create a gridplot
p = gridplot([[p1, p2, p3], [p4, p5, p6]], toolbar_location='above')

show(p)
Hide code cell output

Note

The sample size (n) shown in the ROC plots may be smaller than the cohort size because samples with a missing model prediction, risk group, or outcome are removed before the ROC AUC is computed.

AML epigenomic risk (high-low) + risk group#

Hide code cell source
# Binary model
model_name = 'AML Epigenomic Risk'

# (cohort, panel title) for each column of the grid
roc_cohorts = [
    (df_px, 'Discovery cohort'),
    (df_cog, 'Discovery COG peds AML Dx'),
    (df_test, 'Validation cohort'),
]

# Top row: model and risk group as separate curves (titled panels)
p1, p2, p3 = [plot_roc_auc(cohort, 'os.evnt', model_name, title=label)
              for cohort, label in roc_cohorts]

# Bottom row: a single curve for the averaged model + risk group score (untitled)
p4, p5, p6 = [plot_roc_auc(cohort, 'os.evnt', model_name, sum_models=True)
              for cohort, _ in roc_cohorts]

# Create a gridplot
p = gridplot([[p1, p2, p3], [p4, p5, p6]], toolbar_location='above')

show(p)
Hide code cell output

AML epigenomic risk + latest risk group (AAML1831 COG)#

Hide code cell source
# Settings for the three columns of the grid; every panel uses df_cog and 'os.evnt'
panel_settings = [
    {'risk_group': 'Risk Group'},
    {'risk_group': 'Risk Group AAML1831'},
    {'risk_group': 'Risk Group AAML1831', 'sum_models': True},
]
panel_titles = [
    'Risk group AAML1031-0531',
    'Risk group AAML1831',
    'Risk group AAML1831 + Epigenomic Risk',
]

# Probability model (top row, titled panels)
model_name = 'P(High Risk)'
p1, p2, p3 = [plot_roc_auc(df_cog, 'os.evnt', model_name, title=panel_title, **settings)
              for settings, panel_title in zip(panel_settings, panel_titles)]

# Binary model (bottom row, untitled panels)
model_name = 'AML Epigenomic Risk'
p4, p5, p6 = [plot_roc_auc(df_cog, 'os.evnt', model_name, **settings)
              for settings in panel_settings]

# Create a gridplot
p = gridplot([[p1, p2, p3], [p4, p5, p6]], toolbar_location='above')

show(p)
Hide code cell output

AL epigenomic phenotype#

Hide code cell source
## TODO: multiclass ROC AUC for the AL epigenomic phenotype model (draft below is kept commented out)
# from sklearn.metrics import roc_curve, auc
# from sklearn.preprocessing import label_binarize
# from itertools import cycle
# from bokeh.plotting import figure, show
# from bokeh.io import output_notebook
# import pandas as pd
# import numpy as np

# def plot_multiclass_roc_auc(df, target, model_names, title=None):
#     """
#     Plots ROC AUC for a multiclass classifier using Bokeh, handling non-integer class labels.
    
#     Parameters:
#     - df: pandas DataFrame containing model predictions as columns and actual target variable.
#     - target: Name of the column containing the actual target variable.
#     - model_names: List of column names containing the model predictions for each class.
#     - title: Title of the plot.
#     """
    
#     # Convert target to binary (one-hot encoding)
#     classes = df[target].unique()
#     y = label_binarize(df[target], classes=classes)
#     n_classes = y.shape[1]

#     # Setup plot
#     p = figure(title=title,
#                x_axis_label='False Positive Rate',
#                y_axis_label='True Positive Rate',
#                width=1500, height=500,
#                tools='save,reset,pan,zoom_in,zoom_out')
#     p.line([0, 1], [0, 1], line_dash="dashed", color="gray", line_width=1)

#     # Colors for each line
#     colors = get_custom_color_palette()

#     for i, color in zip(range(n_classes), colors):
#         # Prepare true and predicted values
#         true = y[:, i]
#         # Assuming each model_name is now a single column with prediction probabilities for class i
#         for model_name in model_names:
#             predicted = df[model_name]  # Corrected to directly access model predictions
            
#             fpr, tpr, _ = roc_curve(true, predicted)
#             roc_auc = auc(fpr, tpr)
#             p.line(fpr, tpr, legend_label=f"Class {i} ({model_name}) AUC = {roc_auc:.2f}",
#                    color=color, line_width=2)

#     p.legend.location = "bottom_right"
#     p.legend.click_policy="hide"
#     p.toolbar.logo = None
#     p.legend.label_text_font_size = '8pt'

#     return p

# multiclass_matrix = df_dx[['WHO 2022 Diagnosis', 'AL Epigenomic Phenotype_int']]

# plot = plot_multiclass_roc_auc(df = multiclass_matrix, target = 'WHO 2022 Diagnosis', 
#                                model_names = ['AL Epigenomic Phenotype_int'])
# show(plot)

Box Plots#

Hide code cell source
# Colour the boxes by the binary model call. Set explicitly here (as the
# stacked bar plot cell does) so this cell does not depend on whatever value
# `model_name` was left with by the ROC cells above.
model_name = 'AML Epigenomic Risk'

# Arguments shared by the three box plots of P(High Risk) in the test cohort
shared_kwargs = dict(df=df_test, y='P(High Risk)', hue=model_name,
                     trialname='StJude trials', save_plot=False, figsize=(4, 3))

draw_boxplot(x='Risk Group',
             order=['High Risk', 'Standard Risk', 'Low Risk'],
             **shared_kwargs)

draw_boxplot(x='MRD 1 Status',
             order=['Positive', 'Negative'],
             **shared_kwargs)

draw_boxplot(x='Primary Cytogenetic Code',
             order='auto',
             **shared_kwargs)
Hide code cell output
../_images/19ab78066396d074cae34a10270a84492abd9dab583dee907c3dc07f8f81030e.png ../_images/a082ebd6c8b9dca37e767c61bc0acb30d75dcb7fb673f7e191bf2ce53b62272f.png ../_images/d94fab21268622a8a3b49d922ab84c7d2e0086d3e2fcc2bb8369af1f552574b7.png

Stacked Bar Plots#

Hide code cell source
model_name = 'AML Epigenomic Risk'

# Arguments shared by the three stacked bar plots of the test cohort
shared_kwargs = dict(df=df_test, y=model_name, hue=model_name,
                     trialname='StJude trials', save_plot=False, figsize=(4, 3))

draw_stacked_barplot(x='MRD 1 Status',
                     order=['Positive', 'Negative'],
                     **shared_kwargs)

draw_stacked_barplot(x='Risk Group',
                     order=['High Risk', 'Standard Risk', 'Low Risk'],
                     fontsize=9,
                     **shared_kwargs)

draw_stacked_barplot(x='Primary Cytogenetic Code',
                     order='auto',
                     fontsize=6,
                     **shared_kwargs)
Hide code cell output
../_images/5d0afe0830e6b734fcc2581754aaced87dfefa5d707c88d51d2e1b5f032a7452.png ../_images/9ade8b5d4d4af61093749e00c2ba8b98bd9ada4f0291a2b9e8d5c76a454a6050.png ../_images/e234a1ae274873ddd4a946761ff72da29b987a841139cb344f0a7ba20d226450.png

Sankey plots#

## TODO

Watermark#